{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b050edae-48cf-47ba-b2ae-bf9fc9098bb4",
   "metadata": {},
   "source": [
    "## Word Position Swapping for Text Augmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0ab61d1f-466f-4469-99da-b172a959c2cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Author: Sebastian Raschka\n",
      "\n",
      "Python implementation: CPython\n",
      "Python version       : 3.10.6\n",
      "IPython version      : 8.12.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%load_ext watermark\n",
    "%watermark -a 'Sebastian Raschka' -v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1f2926e6-aeac-4a9b-b762-f8a4a891d5b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "\n",
    "def word_position_swapping(sentence, swapping_rate=0.1):\n",
    "    words = sentence.split()\n",
    "    num_swaps = int(len(words) * swapping_rate)\n",
    "\n",
    "    for _ in range(num_swaps):\n",
    "        # Select two random indices to swap\n",
    "        index1, index2 = random.sample(range(len(words)), 2)\n",
    "\n",
    "        # Swap the words at the selected indices\n",
    "        words[index1], words[index2] = words[index2], words[index1]\n",
    "\n",
    "    return \" \".join(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f3bfef8-58ca-45c8-9762-9be6b23b06ef",
   "metadata": {},
   "source": [
    "**Word position swapping with a 20% swapping rate**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1820de77-bd90-4247-a7d3-1ff7dc429ff4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The brown quick fox jumped over the lazy dog.\n"
     ]
    }
   ],
   "source": [
    "random.seed(1)\n",
    "sentence = \"The quick brown fox jumped over the lazy dog.\"\n",
    "augmented_sentence = word_position_swapping(sentence, swapping_rate=0.2)\n",
    "print(augmented_sentence)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47edf286-4825-4274-94b4-d10423767646",
   "metadata": {},
   "source": [
    "**Show difference before and after**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "823a4b6b-5c33-4fd5-ade7-3b718d902ad4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  The\n",
      "+ brown\n",
      "  quick\n",
      "- brown\n",
      "  fox\n",
      "  jumped\n",
      "  over\n",
      "  the\n",
      "  lazy\n",
      "  dog.\n"
     ]
    }
   ],
   "source": [
    "import difflib\n",
    "\n",
    "\n",
    "d = difflib.Differ()\n",
    "diff = d.compare(sentence.split(), augmented_sentence.split())\n",
    "\n",
    "print('\\n'.join(diff))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
